import random
import time
from datetime import datetime, timedelta

import scrapy
from scrapy import Request, Selector
from scrapy.utils.project import get_project_settings

from utils.data_process import DataProcess
from wxgzh_news.api.addnews import AddNews
from wxgzh_news.api.dicts.wxnews_dict import gzh_catid_dict
from wxgzh_news.api.wxgzh import WxApi
from wxgzh_news.constant import Default_User_Agent
from wxgzh_news.items import WxgzhNewsItem


class GzhArticlesSpider(scrapy.Spider):
    """Crawl recent articles from a fixed set of WeChat official accounts (gzh).

    Flow: for each configured account, hit the mp.weixin.qq.com appmsg list
    API (``parse_list``), then fetch each fresh article page
    (``parse_detail``), clean its HTML, re-host inline images, and yield the
    processed item values.
    """
    name = 'gzh_articles'
    allowed_domains = ['weixin.qq.com']
    start_urls = ['http://weixin.qq.com/']

    # Display name -> (WeChat account id, fakeid used by the list API).
    gzh_codes = {
        "中国物流与采购联合会": ("cflp2014", "MzA3ODQ0MDEzMA=="),
        "中物联采购委": ("CFLP_SCM", "MjM5NTE5Mzk0MA=="),
        "中物联现代供应链研究院": ("cflpsca", "MzU2MDgzMDE5MA=="),
        "林度空间": ("gh_81436afc504c", "MzI3OTcyMzc2NA=="),
        "供应链架构师": ("SCM_art", "MzIzODAxMzY2MQ=="),
        "五道口供应链研究院": ("WDK-SCM", "MjM5ODc4Mzc0MA=="),
        "供应链管理专栏": ("wwwscm-blogcom", "MzUyMTQ3MTkzNA=="),
        "供应链管理云平台": ("scmyun", "MzA4MDgzNjAxMQ=="),  # count=7
        "供应链金融": ("sinoscf", "MzA3MzIxMTkyMw=="),
        "招标采购与供应链管理智库": ("gh_b3f5a7d25b0d", "MzkzOTI2MjgyNw=="),
        "采购从业者": ("costcontrol", "MjM5OTA5NDQ5Mg=="),  # 7
    }

    settings = get_project_settings()
    # Oldest publication date (inclusive) still considered fresh.
    # NOTE(review): assumes the TODAY setting is a datetime.date — confirm.
    limit_day = settings.get("TODAY") - timedelta(days=settings.get("CRAWL_CYCLE", default=1) - 1)
    wxapi = WxApi()

    def start_requests(self):
        """Issue one article-list API request per configured account."""
        for gzh_name, (_account, fake_id) in self.gzh_codes.items():
            url = (
                f"https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex&begin=0&count=5&fakeid={fake_id}"
                f"&type=9&query=&token={self.wxapi.token}&lang=zh_CN&f=json&ajax=1"
            )

            yield Request(
                url=url,
                headers=self.wxapi.headers.copy(),
                cookies=self.wxapi.cookies,
                callback=self.parse_list,
                dont_filter=True,
                meta={"gzh_name": gzh_name},
            )

            # Crude anti-ban throttle between accounts. NOTE(review): this
            # blocks the whole Twisted reactor; Scrapy's DOWNLOAD_DELAY /
            # AUTOTHROTTLE would be the non-blocking way to achieve this.
            time.sleep(random.randint(1, 5) + random.random())

    def parse(self, response, **kwargs):
        # Unused default callback: all responses are routed through
        # parse_list / parse_detail explicitly.
        pass

    def parse_list(self, response, **kwargs):
        """Parse the appmsg list API response and schedule detail requests.

        Stops at the first article older than ``limit_day``; the remaining
        entries are assumed to be older still (list appears newest-first).

        :param response: JSON response from the appmsg list API.
        """
        try:
            resp_data = response.json()
        except Exception:
            # Non-JSON body usually means the token/cookie session expired;
            # log instead of silently dropping the account.
            self.logger.warning("parse_list: non-JSON response from %s", response.url)
            return

        if resp_data['base_resp']['err_msg'] != "ok":
            self.logger.warning("parse_list: API error for %s: %s",
                                response.meta.get('gzh_name'), resp_data['base_resp'])
            return

        gzh_name = response.meta['gzh_name']
        wxpt_id = gzh_catid_dict[gzh_name]

        for app_msg in resp_data['app_msg_list']:
            c_time = app_msg.get("create_time", app_msg.get("update_time"))
            addtime = datetime.fromtimestamp(c_time) if c_time else datetime.now()

            # Entries after this one are older, so stop rather than skip.
            if addtime.date() < self.limit_day:
                break

            articleItem = WxgzhNewsItem(wxpt_id=wxpt_id)
            articleItem.introduce = app_msg.get("digest", "")
            articleItem.title = app_msg['title']
            articleItem.copyfrom = gzh_name
            articleItem.addtime = str(addtime)

            # BUG FIX: the original used .rstrip("#rd"), which strips any
            # trailing run of the characters '#', 'r', 'd' — corrupting URLs
            # that merely end in one of those letters. Strip only the literal
            # "#rd" fragment suffix.
            detail_url = app_msg['link']
            if detail_url.endswith("#rd"):
                detail_url = detail_url[:-len("#rd")]
            articleItem.fromurl = detail_url

            headers = {
                "User-Agent": Default_User_Agent,
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh-Hans;q=0.9",
                "Connection": "keep-alive",
                "Accept-Encoding": "gzip, deflate, br",
                "Host": "mp.weixin.qq.com"
            }

            yield Request(
                url=detail_url,
                headers=headers,
                callback=self.parse_detail,
                dont_filter=False,
                meta={'item': articleItem},
            )

    def parse_detail(self, response, **kwargs):
        """Extract and clean the article body, re-host images, yield values.

        :param response: HTML response of an individual article page.
        """
        articleItem: WxgzhNewsItem = response.meta['item']
        content = response.css("div.rich_media_content").get()
        if content is None:
            # Deleted articles / verification pages carry no content div;
            # previously this crashed with AttributeError on .replace().
            self.logger.warning("parse_detail: no content div at %s", response.url)
            return

        # Strip boilerplate "click the blue text / follow us" banners.
        content = content.replace("：点击上方蓝字", "").replace("：点击蓝字", "").replace("点击蓝字", "")
        content = content.replace("：关注我们", "").replace("关注我们", "")

        content = DataProcess.purify_tags(content)

        # Re-host every inline image and rewrite its src to the uploaded URL.
        sel = Selector(text=content)
        for img_src in sel.css("img::attr(src)").getall():
            if not img_src.strip():
                continue
            upload_url = AddNews.upload_pic(remote_url=response.urljoin(img_src))
            if upload_url:
                content = content.replace(img_src, upload_url)

        articleItem.content = content

        articleItem.set_defaults(skip_time=True)
        yield articleItem.values


